欢迎关注“小丫画图”公众号,回复“小白”,看小视频,实现点鼠标跑代码。

小丫微信: epigenomics E-mail:

作者:胡二强

小丫编辑校验

需求描述

enrichr的优点是批量获得各个数据库里的富集结果。如果用clusterprofiler做富集分析,也能一次性把各数据库的结果输出一个报告,就会很方便。

输入一个基因列表(gene symbol和foldchange)和msigdb里的所有gmt文件,用clusterprofiler做富集分析,输出表格和barplot。

出自http://www.docin.com/p-2008991503.html

应用场景

富集分析用哪个数据库做注释?

读paper找答案。有的文章用GO,有的用KEGG,有的用hallmark,有的从文献中收集基因集。

哪个适合我?挨个尝试太麻烦,批量全都跑出来。

用上调/下调基因分别做富集分析(ORA)?还是用GSEA跑全部基因?

两种方法 X 所有数据库,都跑出来看看。

哪个容易讲故事,最后就选哪个方法、哪个数据库。然后继续做深入研究。后续展示方式可参考https://k.koudai.com/sm2p0xYn

环境设置

使用国内镜像安装包

options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="http://mirrors.tuna.tsinghua.edu.cn/bioconductor/")
install.packages("msigdbr")
install.packages("shadowtext")
install.packages("ggstance")

加载包

library(clusterProfiler)
## 
## clusterProfiler v3.16.1  For help: https://guangchuangyu.github.io/software/clusterProfiler
## 
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:stats':
## 
##     filter
library(msigdbr)
library(DOSE)
## DOSE v3.14.0  For help: https://guangchuangyu.github.io/software/DOSE
## 
## If you use DOSE in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
library(enrichplot)
library(ggplot2)
library(plyr)
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:clusterProfiler':
## 
##     arrange, mutate, rename, summarise
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Sys.setenv(LANGUAGE = "en") #显示英文报错信息
options(stringsAsFactors = FALSE) #禁止chr转成factor

自定义函数

## 数据处理函数
merge_result2 <- function(enrichResultList, output = "compareClusterResult") {
    if ( !is(enrichResultList, "list")) {
        stop("input should be a name list...")
    }
    if ( is.null(names(enrichResultList))) {
        stop("input should be a name list...")
    }
    x <- lapply(enrichResultList, as.data.frame)
    names(x) <- names(enrichResultList)
    y <- ldply(x, "rbind")   

    
    if (output == "compareClusterResult") {
        y <- plyr::rename(y, c(.id="Cluster"))
        y$Cluster = factor(y$Cluster, levels=names(enrichResultList))
        return(new("compareClusterResult",
            compareClusterResult = y))
    } 
    
    y <- plyr::rename(y, c(.id="Category"))
    if (output == "enrichResult") {
        return(new("enrichResult",
            result = y))        
    }
       
    if (output == "gseaResult") {
        return(new("gseaResult",
            result = y))        
    }   
}

keep_category <- function(em_ORA, n) {
    table_em <- as.numeric(table(em_ORA$Category))
    start <- rep(0, length(table_em) - 1)
    for(i in seq_len(length(table_em) - 1)) {
        start[i] <- sum(table_em[seq_len(i)])   
    }
    showCategorys <- sapply(table_em, function(x) min(n, x))
    start <- c(0, start) + 1
    end <- start + showCategorys - 1
    keep <- NULL
    for(i in seq_len(length(start))) {
        keep <- c(keep, c(start[i] : end[i]))
    } 
    return(keep)
}


enrich_filter <- function(em_result, showCategory) {
    keep <- keep_category(em_result, showCategory)
    em_result <- em_result[keep, ]
    if ("NES" %in% colnames(em_result))
        em_result$Count <- em_result$core_enrichment %>% 
            strsplit(split = "/")  %>%  
            vapply(length, FUN.VALUE = 1)
    return(em_result)
}

## 作图函数
em_plot <- function(em_1 = NULL, em_2 = NULL, showCategory = 2, fill = "p.adjust", hjust = 1) {

    fill <- match.arg(fill, c("Category", "p.adjust", "log10_p.adjust"))
    result1 <- enrich_filter(em_1, showCategory)     
    if (is.null(em_2)) { 
        result <- result1 
    } else {
        result2 <- enrich_filter(em_2, showCategory) 
        result2$Count <- -result2$Count          
        result <- rbind(result1, result2)
    }
    result$Category <- gsub("\n.*", "", result$Category)     
    result$log10_p.adjust <- log10(result$p.adjust)
    
    data_plot <- result[, c("ID", "Category", "p.adjust", "log10_p.adjust", "Count")]
    data_plot2 <- data_plot
    data_plot2$ID <- factor(data_plot2$ID, levels = unique(data_plot2$ID))
    data_plot2 <- plyr::rename(data_plot2, c("Count" = "gene_number"))
    h_just <- ifelse(data_plot2$gene_number < 0, -hjust, hjust)
    ggplot(data_plot2, aes_string(x = "gene_number", y = "ID", fill = fill)) + 
        geom_col() +       
        geom_text(aes_(x =~ gene_number + h_just, label =~ abs(gene_number)), 
            color="black") + 
        scale_x_continuous(label = abs,
                           expand = expansion(mult = c(.01, .01))) + #两侧留空
        theme_classic() + 
        ylab("") + 
        theme(axis.title.x = element_text(size = 15)) +     
        facet_grid(Category ~ ., scales="free", space="free") 
}

注释的准备

三种方法,根据自己的需要任选其一,或组合使用:

方法一:使用R包msigdbr里的注释

最简单常用

# 查看msigdbr所支持的物种
# 这里没有我要的物种怎么办?看方法三
msigdbr_species()
## # A tibble: 11 x 2
##    species_name             species_common_name      
##    <chr>                    <chr>                    
##  1 Bos taurus               cattle                   
##  2 Caenorhabditis elegans   roundworm                
##  3 Canis lupus familiaris   dog                      
##  4 Danio rerio              zebrafish                
##  5 Drosophila melanogaster  fruit fly                
##  6 Gallus gallus            chicken                  
##  7 Homo sapiens             human                    
##  8 Mus musculus             house mouse              
##  9 Rattus norvegicus        Norway rat               
## 10 Saccharomyces cerevisiae baker's or brewer's yeast
## 11 Sus scrofa               pig
# 以人为例
gmt <- msigdbr(species = "Homo sapiens") 
gmt2 <- gmt%>%
   dplyr::select(gs_name, entrez_gene)
gmts <- split(gmt2, gmt$gs_cat)

方法二:使用本地gmt文件

  • 可以从MSigDB下载自己想要分析的注释对应的gmt文件,保存到当前文件夹。

  • 做单细胞的小伙伴,可以用FigureYa194pySCENIC生成的regulon的gmt文件,进一步在更多数据集里深入挖掘,例如TCGA数据。

# 读取当前文件夹内的所有gmt文件
files = list.files(pattern="*.gmt")
gmts <- setNames(lapply(files, read.gmt), gsub(".v7.2.entrez.gmt", "", files))

方法三:自己定义注释文件

如果你研究的领域比较新或小众,那么现有的注释文件可能不能满足你的分析需求。这时往往会自己总结一个基因集:

  • 查文献:你对某篇文章发现的某些基因感兴趣,例如参与某个生物学过程的几十上百个基因;
  • 序列比对:如果你的目标物种现有的注释太少,可以跟模式生物做序列比对,用相似序列的同源基因来注释目标物种的基因。

我们首先来看一下注释文件gmts的结构:

## 类型为list
class(gmts)
## [1] "list"
## list中的每一项都是一个数据框,数据框有两列,第一列是term名称,第二列是基因。
head(gmts[[1]])
## # A tibble: 6 x 2
##   gs_name  entrez_gene
##   <chr>          <int>
## 1 chr10p11       26983
## 2 chr10p11      399746
## 3 chr10p11   100288319
## 4 chr10p11   100420618
## 5 chr10p11       91074
## 6 chr10p11       94134

可以看到,注释数据为一个list,list中的每一项都是一个数据框,代表来源于一个category的注释数据。每个数据框包含两列,第一列为term(如通路/功能/感兴趣的基因集),第二列为gene id(或gene symbol等)。

当我们从某篇文献中看到一个感兴趣的基因集,希望将它一起放入到已有的注释数据中一起做富集时,可以进行如下操作:

(1)将这些感兴趣的基因集处理成annotation.txt所示的格式,此文件有两列,第一列是用户自定义的term name,第二列是此term中对应的基因。

(2)得到所研究物种的背景基因,它可以是该物种的所有基因,也可以是表达谱中所有具有表达值的基因,如background.txt

(3)将新的注释信息与方法一或二的注释合并:

anno <- read.table("annotation.txt", sep="\t", header = T)
background <- read.table("background.txt", sep="\t", header = F)
background <- setdiff(background[,1], anno$entrez_gene)
bg_anno <- data.frame(gs_name = rep("background", length(background)), entrez_gene = background)
anno_data <- rbind(anno, bg_anno)
gmts$my_anno <- as_tibble(anno_data)

GSEA

输入文件

easy_input_limma.csv,差异表达分析结果。是FigureYa59Plus_GEO2DEG的输出文件,可无缝对接。

需要两列:基因名和Foldchange(或其他有意义的连续变量,例如ChIP/ATAC-seq的peak分数)。

# 加载差异表达分析结果
gsym.fc <- read.csv("easy_input_limma.csv")
colnames(gsym.fc)[1] <- "SYMBOL"
gsym.fc[1:3,]
##   SYMBOL    logFC  AveExpr        t      P.Value    adj.P.Val        B
## 1  KLK10 8.778049 10.50675 111.5755 3.802229e-11 1.760315e-07 15.41764
## 2  FXYD3 7.748971 10.45958 107.2722 4.809044e-11 1.760315e-07 15.29791
## 3   KLK7 9.138924 10.59638 102.6583 6.252983e-11 1.760315e-07 15.15779
#把gene symbol转换为ENTREZ ID
#此处物种是人,其他物种的ID转换方法,请参考FigureYa52GOplot
gsym.id <- bitr(gsym.fc$SYMBOL, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
## Loading required package: org.Hs.eg.db
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:plyr':
## 
##     rename
## The following object is masked from 'package:clusterProfiler':
## 
##     rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:plyr':
## 
##     desc
## The following object is masked from 'package:clusterProfiler':
## 
##     slice
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## 
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(gsym.fc$SYMBOL, fromType = "SYMBOL", toType = "ENTREZID", :
## 21.42% of input gene IDs are fail to map...
#让基因名、ENTREZID、foldchange对应起来
gsym.fc.id <- merge(gsym.fc, gsym.id, by="SYMBOL", all=F)

#按照foldchange排序
gsym.fc.id.sorted <- gsym.fc.id[order(gsym.fc.id$logFC, decreasing = T),]

#获得ENTREZID、foldchange列表,做为GSEA的输入
id.fc <- gsym.fc.id.sorted$logFC
names(id.fc) <- gsym.fc.id.sorted$ENTREZID

GSEA富集分析

查看clusterProfiler的最新用法,请移步Y叔的在线电子书:https://yulab-smu.top/clusterProfiler-book/

# 自定义函数
gsea_func <- function(x, genelist, readable = FALSE) {
    GSEA_result <- GSEA(genelist, TERM2GENE = x, eps = 0)
    if (readable & nrow(GSEA_result) > 0) 
        GSEA_result <- setReadable(GSEA_result, 'org.Hs.eg.db', #物种
                    'ENTREZID') #转回gene symbol
    return(GSEA_result)
}

em <- setNames(lapply(gmts, gsea_func, id.fc, readable = TRUE), names(gmts)) %>% 
    merge_result2(output = "gseaResult")
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## no term enriched under specific pvalueCutoff...
# 把富集分析结果输出到文件
write.csv(em@result, "output_GSEA.csv", quote = F)

你可能喜欢打开output_GSEA.csv,总揽各种注释数据库的富集结果,或者画柱状图来看。

用自定义函数画柱状图

画柱状图来总揽各个注释数据库的富集分析结果,更直观。

显著的term非常多,不好全部在图中画出来,可以用showCategory参数调整每种注释画出的term的个数。

默认情况下条形图的颜色代表此term校正后的P值,可以通过修改fill参数来映射其他变量,如“Category”, “p.adjust” 和 “log10_p.adjust”。

参数hjust表示gene number的label距离bar顶端的距离,默认为1。

em_plot(em, showCategory = 5, fill = "log10_p.adjust", hjust = 5)

em_plot(em, showCategory = 5, fill = "Category", hjust = 5) + scale_fill_brewer(palette="Set1")

ggsave("batchGSEA.pdf", width = 12, height = 12)

NES有正负之分,可以把负的画到左侧,正的画到右侧。

em_GSEA1 <- em_GSEA2 <- em
res <- em@result
em_GSEA1@result <- res[which(res$NES > 0), ] # 被激活的通路
em_GSEA2@result <- res[which(res$NES < 0), ] # 被抑制的通路

em_plot(em_GSEA1, em_GSEA2, showCategory = 2, fill = "log10_p.adjust", hjust = 10)

em_plot(em_GSEA1, em_GSEA2, showCategory = 2, fill = "Category", hjust = 12) + scale_fill_brewer(palette="Set1")

ggsave("batchGSEA_sep.pdf", width = 12, height = 12)

确定文章里用哪个注释数据库做富集分析

接下来,结合你对自己研究领域的认识,提取比较理想的数据集的富集结果,作为文件supplementary file。进而DIY各种富集分析结果图,例如FigureYa11bubble那种泡泡图。

这里提取Catagory列为H的hallmark注释:

# 自定义函数
result_output <- function(em_data, category) {
    em_data <- as.data.frame(em_data)
    file_out <- em_data[which(em_data$Category == category), ]
}

file_out <- result_output(em, "H") # 提取Catagory列为`H`的结果
write.csv(file_out, "output_GSEA_H.csv", quote = F)

差异基因跟我感兴趣的通路有关吗?

你可能不在乎用哪种注释,只想知道差异表达基因在我感兴趣的生物学过程有没有富集。

我们从所有注释中搜索关键词,例如CHECKPOINT,发现在3个注释中都有cell cycle checkpoints,说明你获取差异表达基因的那种处理条件/基因突变可能影响了细胞周期

get_term <- function(em_data, term){
    ids <- grep(term, as.data.frame(em_data)$ID)
    em_data[ids, ]
}

file_out <- get_term(em@result, "CHECKPOINT")
DT::datatable(file_out, escape = F, rownames=F)

ORA

输入文件

其实只需要一个基因名文件。如果你已经拿到基因名easy_input_up.txteasy_input_down.txt(至少要有一个文件)就可以跳过这步,直接进入“ORA富集分析”。

考虑到差异基因的筛选参数决定了筛出哪些差异表达基因,进而影响ORA的结果。因此,这里从差异表达基因开始,一气呵成,方便调整筛基因的参数。

easy_input_limma.csv,同上,差异表达分析结果。是FigureYa59Plus_GEO2DEG的输出文件,可无缝对接

# 加载差异表达分析结果
gsym.fc <- read.csv("easy_input_limma.csv")
colnames(gsym.fc)[1] <- "SYMBOL"
gsym.fc[1:3,]
##   SYMBOL    logFC  AveExpr        t      P.Value    adj.P.Val        B
## 1  KLK10 8.778049 10.50675 111.5755 3.802229e-11 1.760315e-07 15.41764
## 2  FXYD3 7.748971 10.45958 107.2722 4.809044e-11 1.760315e-07 15.29791
## 3   KLK7 9.138924 10.59638 102.6583 6.252983e-11 1.760315e-07 15.15779
# cutoff
logFCcut <- 1 #log2-foldchange
adjPcut <- 0.05 #adj.P.value

# 筛选差异基因
geneup <- gsym.fc[gsym.fc$logFC > logFCcut & gsym.fc$adj.P.Val < adjPcut, ]$SYMBOL
genedown <- gsym.fc[gsym.fc$logFC < -logFCcut & gsym.fc$adj.P.Val < adjPcut, ]$SYMBOL
# 数量
length(geneup)
## [1] 946
length(genedown)
## [1] 986
# 保存到文件,便于套用格式
write.table(geneup, "easy_input_up.txt", quote = F, row.names = F, col.names = F)
write.table(genedown, "easy_input_down.txt", quote = F, row.names = F, col.names = F)

ORA富集分析

# 自定义函数
enrich_func <- function(x, gene, readable = FALSE) {
    en_result <- enricher(gene, TERM2GENE = x)
    if (readable & nrow(en_result) > 0) 
        en_result <- setReadable(en_result, 'org.Hs.eg.db', #物种
                    'ENTREZID')
                    
    return(en_result)
}

# 先以上调表达的基因为例
# 加载基因名
geneup <- read.table("easy_input_up.txt")$V1

# 把gene symbol转换为ENTREZ ID
# 此处物种是人,其他物种的ID转换方法,请参考FigureYa52GOplot
geneup.id <- bitr(geneup, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(geneup, fromType = "SYMBOL", toType = "ENTREZID", OrgDb =
## "org.Hs.eg.db"): 13.07% of input gene IDs are fail to map...
# 富集分析
em_ORAup <- setNames(lapply(gmts, enrich_func, geneup.id$ENTREZID, readable = TRUE), names(gmts)) %>%
    merge_result2(output = "enrichResult")

# 输出到文件
write.csv(em_ORAup@result, "output_ORA_up.csv", quote = F) 

下调表达的基因也是一样的操作

genedown <- read.table("easy_input_down.txt")$V1
genedown.id <- bitr(genedown, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(genedown, fromType = "SYMBOL", toType = "ENTREZID", OrgDb =
## "org.Hs.eg.db"): 15.4% of input gene IDs are fail to map...
em_ORAdown <- setNames(lapply(gmts, enrich_func, genedown.id$ENTREZID, readable = TRUE), names(gmts)) %>%
    merge_result2(output = "enrichResult")

或者你想把上下调的放一起也看看

genesupdown <- union(geneup.id$ENTREZID, genedown.id$ENTREZID)

em_ORA <- setNames(lapply(gmts, enrich_func, genesupdown, readable = TRUE), names(gmts)) %>%
    merge_result2(output = "enrichResult") 

用自定义函数画柱状图

# 上调表达基因的富集结果
em_plot(em_ORAup, showCategory = 5, fill = "Category", hjust = 10) + scale_fill_brewer(palette="Set1")

# 下调表达基因的富集结果
em_plot(em_ORAdown, showCategory = 5, fill = "Category", hjust = 5) + scale_fill_brewer(palette="Set1")

# 上调的画在右侧,下调的画在左侧
em_plot(em_ORAup, em_ORAdown, showCategory = 5, fill = "Category", hjust = 12) + scale_fill_brewer(palette="Set1")
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors

用enrichplot包里的函数画图

带你顺利衔接enrichplot

# 每种注释筛选3个term
keep1 <- keep_category(em_ORAup, 3)
keep2 <- keep_category(em_ORAdown, 3)
em_ora1 <- em_ORAup[keep1, ]
em_ora2 <- em_ORAdown[keep2, ]

# 上调画右边,下调画左边
em_ora2$Count <- -em_ora2$Count 
em_ora <- new("enrichResult",
            result = rbind(em_ora1, em_ora2))

# 画barplot
#barplot(em_ora, showCategory = nrow(em_ora)) + 
#    scale_x_continuous(label = abs) + 
#    facet_grid(Category ~ ., scales="free",space="free")

# 画dotplot
dotplot(em_ora, showCategory = nrow(em_ora)) + 
    scale_x_continuous(label = abs) + 
    facet_grid(Category ~ ., scales="free",space="free")

确定文章里用哪个注释数据库做富集分析

同上,提取Catagory列为H的hallmark注释:

file_out <- result_output(em_ORA, "H") # 上下调一起
write.csv(file_out, "output_ORA_H.csv", quote = F)

file_out <- result_output(em_ORAup, "H") #上调表达基因
write.csv(file_out, "output_ORAup_H.csv", quote = F)

file_out <- result_output(em_ORAdown, "H") # 下调表达基因
write.csv(file_out, "output_ORAdown_H.csv", quote = F)

跟前面做GSEA输出的output_GSEA_H.csv文件对比看看,大同小异

差异基因跟我感兴趣的通路有关吗?

同上,从所有注释中搜索CHECKPOINT,发现下调表达的基因在多个注释中都有cell cycle checkpoints,说明你获取差异表达基因的那种处理条件/基因突变可能抑制了cell cycle checkpoints

# 提取带有`CHECKPOINT`的富集结果
file_out <- get_term(em_ORAup@result, "CHECKPOINT")
DT::datatable(file_out, escape = F, rownames=F)
# 上调表达的基因没有富集到带checkpoint字样的term

file_out <- get_term(em_ORAdown@result, "CHECKPOINT")
DT::datatable(file_out, escape = F, rownames=F)
# 下调表达基因有多个富集term里有checkpoint字样

Session Info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] zh_CN.UTF-8/zh_CN.UTF-8/zh_CN.UTF-8/C/zh_CN.UTF-8/zh_CN.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.11.4    AnnotationDbi_1.50.3   IRanges_2.22.2        
##  [4] S4Vectors_0.26.1       Biobase_2.48.0         BiocGenerics_0.34.0   
##  [7] dplyr_1.0.2            plyr_1.8.6             ggplot2_3.3.2         
## [10] enrichplot_1.8.1       DOSE_3.14.0            msigdbr_7.2.1         
## [13] clusterProfiler_3.16.1
## 
## loaded via a namespace (and not attached):
##  [1] bit64_4.0.5         RColorBrewer_1.1-2  progress_1.2.2     
##  [4] httr_1.4.2          tools_4.0.2         DT_0.16            
##  [7] utf8_1.1.4          R6_2.5.0            DBI_1.1.0          
## [10] colorspace_2.0-0    withr_2.3.0         tidyselect_1.1.0   
## [13] gridExtra_2.3       prettyunits_1.1.1   bit_4.0.4          
## [16] compiler_4.0.2      cli_2.2.0           scatterpie_0.1.5   
## [19] xml2_1.3.2          labeling_0.4.2      triebeard_0.3.0    
## [22] scales_1.1.1        ggridges_0.5.2      stringr_1.4.0      
## [25] digest_0.6.27       rmarkdown_2.5       pkgconfig_2.0.3    
## [28] htmltools_0.5.0     htmlwidgets_1.5.2   rlang_0.4.9        
## [31] RSQLite_2.2.1       gridGraphics_0.5-0  farver_2.0.3       
## [34] generics_0.1.0      jsonlite_1.7.1      crosstalk_1.1.0.1  
## [37] BiocParallel_1.22.0 GOSemSim_2.14.2     magrittr_2.0.1     
## [40] ggplotify_0.0.5     GO.db_3.11.4        Matrix_1.2-18      
## [43] fansi_0.4.1         Rcpp_1.0.5          munsell_0.5.0      
## [46] viridis_0.5.1       lifecycle_0.2.0     stringi_1.5.3      
## [49] yaml_2.2.1          ggraph_2.0.4        MASS_7.3-53        
## [52] qvalue_2.20.0       grid_4.0.2          blob_1.2.1         
## [55] ggrepel_0.8.2       DO.db_2.9           crayon_1.3.4       
## [58] lattice_0.20-41     graphlayouts_0.7.1  cowplot_1.1.0      
## [61] splines_4.0.2       hms_0.5.3           knitr_1.30         
## [64] pillar_1.4.7        fgsea_1.14.0        igraph_1.2.6       
## [67] reshape2_1.4.4      fastmatch_1.1-0     glue_1.4.2         
## [70] evaluate_0.14       downloader_0.4      data.table_1.13.2  
## [73] BiocManager_1.30.10 vctrs_0.3.5         tweenr_1.0.1       
## [76] urltools_1.7.3      gtable_0.3.0        purrr_0.3.4        
## [79] polyclip_1.10-0     tidyr_1.1.2         assertthat_0.2.1   
## [82] xfun_0.19           ggforce_0.3.2       europepmc_0.4      
## [85] tidygraph_1.2.0     viridisLite_0.3.0   tibble_3.0.4       
## [88] rvcheck_0.1.8       memoise_1.1.0       ellipsis_0.3.1